import pandas as pd
import numpy as np
import math
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
import visuals as vs
from mpl_toolkits.mplot3d import axes3d
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn import tree
from xgboost import XGBClassifier
# IPython magic: render matplotlib figures inline in the notebook.
% matplotlib inline
# Set Random Seed
np.random.seed(42)
# NOTE(review): this builds a RandomState object and immediately discards it,
# so it has no effect; the seed(42) call above is what fixes the global RNG.
np.random.RandomState(42)
# Read csv
data = pd.read_csv("data.csv")
# Peek at the first 10 rows (notebook display expression).
data.head(10)
print("Number of data points :", len(data))
# .shape gives (number of rows, number of columns)
data.shape
# Describe data (summary statistics per column)
data.describe()
# print columns
data.columns
# Column dtypes and non-null counts
data.info()
# Save labels in y (the 'diagnosis' column)
y = data["diagnosis"]
We don't need the patient "id" column, "diagnosis" is our label, and "Unnamed: 32" contains only NaNs. Let's exclude these three columns.
# Drop columns: keep only the feature columns in X
X = data.drop(["id", "diagnosis", "Unnamed: 32"], axis=1)
# Plot a Correlation chart
corr = X.corr() # .corr computes the pairwise correlation between all feature columns
#plt.figure(figsize=(20,15))
sns.set(rc={'figure.figsize':(25,20)})
# plot a heatmap of the correlation matrix, annotated with the coefficients
sns.heatmap(corr, cbar = True, square = True, annot=True, fmt= '.2f',annot_kws={'size': 10},
xticklabels= X.columns, yticklabels= X.columns,
cmap= 'coolwarm')
Features like (something)_mean, (something)_se and (something)_worst have a natural correlation because they are all generated from the same data; for example, to generate radius_mean, radius_se and radius_worst, the radius measurements are used.
Radius, Perimeter and Area have a strong positive correlation.
# Plot correlation between 2 features and distribution
# (jointplot draws a bivariate plot plus the marginal distributions)
sns.jointplot(X.loc[:,'radius_mean'],
X.loc[:,'area_mean'],
kind="scatter")
# FIX: kind="regg" is not a valid seaborn jointplot kind (valid: scatter, reg,
# resid, kde, hex) and raises ValueError; changed to "reg".
sns.jointplot(X.loc[:,'radius_mean'],
X.loc[:,'perimeter_mean'],
kind="reg")
sns.jointplot(X.loc[:,'area_mean'],
X.loc[:,'perimeter_mean'],
kind="scatter")
Radius has a strong positive correlation with Concave Points.
# Plot correlation between 2 features and distribution
# FIX: kind="regg" -> "reg" ("regg" is not a valid seaborn jointplot kind)
sns.jointplot(X.loc[:,'radius_mean'],
X.loc[:,'concave points_mean'],
kind="reg")
Compactness, Concavity and Concave Points have a strong positive correlation.
# Plot correlation between 2 features and distribution
# FIX: kind="regg" -> "reg" ("regg" is not a valid seaborn jointplot kind)
sns.jointplot(X.loc[:,'compactness_mean'],
X.loc[:,'concavity_mean'],
kind="reg")
sns.jointplot(X.loc[:,'compactness_mean'],
X.loc[:,'concave points_mean'],
kind="reg")
sns.jointplot(X.loc[:,'concavity_mean'],
X.loc[:,'concave points_mean'],
kind="reg")
Fractal Dimension has some negative correlation with Radius, Perimeter and Area.
# Plot correlation between 2 features and distribution
# FIX: kind="regg" -> "reg" ("regg" is not a valid seaborn jointplot kind)
sns.jointplot(X.loc[:,'fractal_dimension_mean'],
X.loc[:,'radius_mean'],
kind="reg")
sns.jointplot(X.loc[:,'fractal_dimension_mean'],
X.loc[:,'perimeter_mean'],
kind="reg")
sns.jointplot(X.loc[:,'fractal_dimension_mean'],
X.loc[:,'area_mean'],
kind="reg")
# Plot a countplot of the class labels to visualize class balance
sns.set(rc={'figure.figsize':(8,5)})
sns.countplot(y)
The data isn't balanced: there are more cases of benign tumors than malignant ones. Later we'll use methods to balance the data and analyze whether the results improve.
# Print the count of each class.
# value_counts() sorts by frequency, so position 0 is the majority class.
count = y.value_counts()
# FIX: Python 3 print function (the original used Python 2 print statements),
# and positional access via .iloc (integer keys on a string-labelled index are
# deprecated/removed in modern pandas).
print('Number of Benign : ', count.iloc[0])
print('Number of Malignant : ', count.iloc[1])
Creating a Volume Mean Feature using radius_mean
# Create a "mean volume" feature: volume of a sphere with the mean radius,
# V = 4/3 * pi * r^3.
pi = 3.1415
# FIX/IDIOM: vectorized pandas expression replaces the original per-row
# Python loop with math.pow; the operand order (r**3 * 4 * pi) / 3 matches
# the original arithmetic exactly.
X["mean_volume"] = (X["radius_mean"] ** 3 * 4 * pi) / 3
Creating a simple new feature, mesuraments_sum_mean, by adding up features related to cell size.
# Create a new feature by adding up some physical measurements
# (the column name keeps the original "mesuraments" spelling used elsewhere)
X["mesuraments_sum_mean"] = X["radius_mean"] + X["perimeter_mean"] + X["area_mean"]
X.head()
Since the range of values of raw data varies widely, in some machine learning algorithms, objective functions will not work properly without normalization. For example, the majority of classifiers calculate the distance between two points by the Euclidean distance. If one of the features has a broad range of values, the distance will be governed by this particular feature. Therefore, the range of all features should be normalized so that each feature contributes approximately proportionately to the final distance.
Another reason why feature scaling is applied is that gradient descent converges much faster with feature scaling than without it.[1]
# Define a scaler function
def scaler(df):
    """Min-max scale every column of *df* into the [0, 1] range.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature table.

    Returns
    -------
    pandas.DataFrame
        Scaled copy of *df* with the same column names.
    """
    # Use a distinct local name so the function's own name is not shadowed.
    mm_scaler = preprocessing.MinMaxScaler()
    scaled_values = mm_scaler.fit_transform(df)
    return pd.DataFrame(scaled_values, columns=df.columns)
# testing scaler
scaled_df = scaler(X)
scaled_df.head()
# Prepare long-format data for the violin plot:
# one row per (diagnosis, feature, value) triple.
data_plot = pd.concat([y,scaled_df],axis=1)
data_plot = pd.melt(data_plot,id_vars="diagnosis",
var_name="features",
value_name='value')
# Plot a violinplot of every feature, split by diagnosis
sns.set(rc={'figure.figsize':(15,30)})
sns.violinplot(x="value", y="features", hue="diagnosis", data=data_plot,split=True, inner="quart")
# Plot a pair grid over the first 9 scaled features:
# lower triangle = KDE contours, upper triangle = scatter, diagonal = univariate KDE.
sns.set(style="white")
df = scaled_df.iloc[:,0:9]
g = sns.PairGrid(df, diag_sharey=False)
g.map_lower(sns.kdeplot, cmap="Blues_d")
g.map_upper(plt.scatter)
g.map_diag(sns.kdeplot, lw=3)
# Plot a Swarmplot of every scaled feature value, colored by diagnosis
sns.set(style="whitegrid", palette="muted")
data_plot = scaled_df
data_plot = pd.concat([y,data_plot.iloc[:,0:]],axis=1)
# Long format again: one row per (diagnosis, feature, value) triple.
data_plot = pd.melt(data_plot,id_vars="diagnosis",
var_name="features",
value_name='value')
#plt.figure(figsize=(10,10))
sns.set(rc={'figure.figsize':(15,30)})
sns.swarmplot(x="value", y="features", hue="diagnosis", data=data_plot)
# Define a function to detect and remove outliers
def remove_outliers(X, y, f=2, distance=1.5):
    """Drop data points that are IQR outliers in more than *f* features.

    Parameters
    ----------
    X : pandas.DataFrame
        Features.
    y : pandas.Series
        Labels aligned with X.
    f : int
        A point is removed only if it is flagged as an outlier in MORE than
        f features.
    distance : float
        Multiplier of the inter-quartile range used as the outlier step.

    Returns
    -------
    tuple (good_X, good_y)
        Features and labels with the outliers removed and the index reset.

    NOTE(review): the flagged values come from ``.index.values`` (labels) but
    are later used through ``X.index[...]`` (positions). This is only correct
    when X has a default RangeIndex, as it does for data returned by
    ``scaler``/``reset_index`` — confirm for other callers.

    Python 3 fixes vs the original: ``Counter.iteritems()`` -> ``items()``,
    print statements -> ``print()`` calls.
    """
    from collections import Counter
    outliers = []
    # For each feature, collect the indices of points with extreme values.
    for feature in X.keys():
        # Q1 / Q3: 25th / 75th percentile of the feature.
        Q1 = np.percentile(X[feature], 25)
        Q3 = np.percentile(X[feature], 75)
        # Outlier step: `distance` times the inter-quartile range.
        step = (Q3 - Q1) * distance
        outliers.append(X[~((X[feature] >= Q1 - step) & (X[feature] <= Q3 + step))].index.values)
    # Flatten and count how many features flag each data point.
    flat_list = [item for sublist in outliers for item in sublist]
    freq = Counter(flat_list)
    # Keep only the points flagged by more than f features.
    outliers_to_remove = [key for key, value in freq.items() if value > f]
    # Remove the outliers, if any were found.
    good_X = X.drop(X.index[outliers_to_remove]).reset_index(drop = True)
    good_y = y.drop(y.index[outliers_to_remove]).reset_index(drop = True)
    # Sort for a readable report.
    outliers_to_remove.sort()
    # Report what was removed.
    for idx in outliers_to_remove:
        print("data point: ", idx, "is considered outlier to more than ", f, " feature")
    print("All ", len(outliers_to_remove), "were removed!")
    # return data without outliers
    return good_X, good_y
# Apply outlier removal to the scaled data and inspect the class balance.
good_X, good_y = remove_outliers(scaled_df, y, f=2, distance=1.5)
good_X.head()
sns.set(rc={'figure.figsize':(8,5)})
sns.countplot(good_y)
# Compare class counts before/after removal.
count = y.value_counts()
count2 = good_y.value_counts()
# FIX: Python 3 print function, and positional access via .iloc (integer
# keys on a string-labelled index are deprecated/removed in modern pandas).
print('Number of Benign removed: ', count.iloc[0] - count2.iloc[0])
print('Number of Malignant removed: ', count.iloc[1] - count2.iloc[1])
Many malignant cases were considered outliers, which makes the data even more unbalanced. Later we'll check whether removing outliers improves results on this dataset.
# Apply PCA by fitting the good data with only two dimensions
pca = PCA(n_components=2).fit(good_X)
# Transform the good data using the PCA fit above
reduced_data = pca.transform(good_X)
# Transform the samples using the PCA fit above (same data here)
pca_samples = pca.transform(good_X)
# Generate PCA results plot (vs is the project-local `visuals` helper module)
pca_results = vs.pca_results(good_X, pca)
# FIX: Python 3 print function (the original used Python 2 print statements).
print("Cumulative explained variance:")
print(pca_results['Explained Variance'].cumsum())
# Scatter the two components, colored by diagnosis.
pca_df = pd.DataFrame(pca_samples, columns=["d1", "d2"])
data_plot = pd.concat([good_y,pca_df.iloc[:,0:]],axis=1)
sns.lmplot(x="d1", y="d2", hue="diagnosis", data=data_plot, markers=["x", "o"], fit_reg=False)
# Apply PCA to the good data
# NOTE(review): the original TODO said "same number of dimensions as features"
# but n_components=3 is used here — confirm which was intended.
pca = PCA(n_components=3)
pca.fit(good_X)
# Transform the samples using the PCA fit above
pca_samples = pca.transform(good_X)
pca_df = pd.DataFrame(pca_samples, columns=["d1", "d2", "d3"])
data_plot = pd.concat([good_y,pca_df.iloc[:,0:]],axis=1)
data_plot.head()
# Plot the three PCA components in a 3D scatter
fig = plt.figure(figsize=(20,12))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(data_plot['d1'], data_plot['d2'], data_plot['d3'], s=100)
#ax.view_init(30, 185)
plt.show()
def pca(X, n_components=3):
    """Project *X* onto its first *n_components* principal components.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table.
    n_components : int
        Target number of PCA dimensions.

    Returns
    -------
    pandas.DataFrame
        The PCA-transformed samples (default integer column names).
    """
    model = PCA(n_components)
    model.fit(X)
    transformed = model.transform(X)
    return pd.DataFrame(transformed)
pca_df = pca(X, 3)
pca_df.head()
Running PCA experiments I realized that many dimensions (PCA components) are needed to explain the data variance, so I decided not to use PCA.
Naive random over-sampling
One way to fight this issue is to generate new samples in the classes which are under-represented. The most naive strategy is to generate new samples by randomly sampling with replacement the current available samples. The RandomOverSampler offers such scheme:
from imblearn.over_sampling import RandomOverSampler
# Randomly duplicate minority-class samples until the classes are balanced.
# NOTE(review): fit_sample was renamed fit_resample in imbalanced-learn 0.4+;
# this code targets the older API — confirm the installed version.
X_resampled, y_resampled = RandomOverSampler().fit_sample(X, y)
from collections import Counter
# Show the per-class counts after resampling.
print(sorted(Counter(y_resampled).items()))
sns.set(rc={'figure.figsize':(8,5)})
sns.countplot(y_resampled)
From random over-sampling to SMOTE and ADASYN
Apart from random sampling with replacement, there are two popular methods to over-sample minority classes: (i) the Synthetic Minority Oversampling Technique (SMOTE) and (ii) the Adaptive Synthetic (ADASYN) sampling method. These algorithms can be used in the same manner:
from imblearn.over_sampling import SMOTE, ADASYN
# SMOTE: balance classes by synthesizing new minority samples.
X_resampled, y_resampled = SMOTE().fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))
sns.set(rc={'figure.figsize':(8,5)})
sns.countplot(y_resampled)
# ADASYN: adaptive variant of synthetic over-sampling.
X_resampled, y_resampled = ADASYN().fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))
sns.set(rc={'figure.figsize':(8,5)})
sns.countplot(y_resampled)
# Define a function to rebalance data
def resample(X,y, method="RandomOverSampler"):
    """Over-sample the minority class to balance the data set.

    Parameters
    ----------
    X, y : features and labels.
    method : str
        One of "RandomOverSampler", "ADASYN" or "SMOTE"; any other value
        falls back to SMOTE.

    Returns
    -------
    tuple (X_resampled, y_resampled)
        The balanced data, with X_resampled rebuilt as a DataFrame carrying
        the original column names.

    BUG FIX: the original used two independent ``if`` statements, so for
    method="RandomOverSampler" the result was immediately overwritten by the
    SMOTE branch of the second if/else. An if/elif/else chain fixes that.
    """
    if method == "RandomOverSampler":
        X_resampled, y_resampled = RandomOverSampler().fit_sample(X, y)
    elif method == "ADASYN":
        X_resampled, y_resampled = ADASYN().fit_sample(X, y)
    else:
        X_resampled, y_resampled = SMOTE().fit_sample(X, y)
    # Show the per-class counts after resampling.
    print(sorted(Counter(y_resampled).items()))
    X_resampled = pd.DataFrame(X_resampled, columns=X.columns)
    return X_resampled, y_resampled
# choose between RandomOverSampler, ADASYN, SMOTE
X_resampled, y_resampled = resample(X,y, "SMOTE")
# Visual check that the classes are now balanced.
sns.set(rc={'figure.figsize':(8,5)})
sns.countplot(y_resampled)
Feature selection works by selecting the best features based on univariate statistical tests. It can be seen as a preprocessing step to an estimator. SelectKBest removes all but the k highest scoring features
def selector(X, y, k=12):
    """Select the *k* best features by the chi-squared test.

    Parameters
    ----------
    X, y : features and labels (chi2 requires non-negative feature values).
    k : int
        Number of features to keep.

    Returns
    -------
    pandas.DataFrame
        X restricted to the k selected columns, keeping their names.
    """
    # FIX: pass k as a keyword — in modern scikit-learn, SelectKBest's
    # k parameter is keyword-only.
    best = SelectKBest(chi2, k=k)
    X_new = best.fit_transform(X, y)
    # get_support() is the boolean mask of the selected columns.
    return pd.DataFrame(X_new, columns=X.columns[best.get_support()])
X_new = selector(X, y, 5)
X_new.head()
# Reload the raw data and rebuild the engineered features from scratch
# (fresh, unscaled copy for the experiment runs below).
data = pd.read_csv("data.csv")
y = data["diagnosis"]
X = data.drop(["id", "diagnosis", "Unnamed: 32"], axis=1)
pi = 3.1415
# Sphere volume from the mean radius, vectorized (same arithmetic as the
# original per-row math.pow loop).
X["mean_volume"] = (X["radius_mean"] ** 3 * 4 * pi) / 3
X["mesuraments_sum_mean"] = X["radius_mean"] + X["perimeter_mean"] + X["area_mean"]
X.shape
Exhaustive search over specified parameter values for an estimator. The parameters of the estimator used to apply these methods are optimized by cross-validated grid-search over a parameter grid.
# Candidate classifiers and their GridSearchCV parameter grids.
# Random Forest Classifier
RF_clf = RandomForestClassifier()
# Parameters to tune
RF_par = {"max_depth": [3, None], "max_features": [1, 3, 10], "min_samples_split": [2, 3, 10],
"min_samples_leaf": [1, 3, 10], "bootstrap": [True, False], "criterion": ["gini", "entropy"]}
# Extra Trees Classifier
XT_clf = ExtraTreesClassifier()
# Parameters to tune
XT_par = { 'n_estimators': [5, 10, 16], "min_samples_split": [2, 3, 10], "min_samples_leaf": [1, 3, 10]}
# Decision Tree Classifier
DT_clf =DecisionTreeClassifier()
# Parameters to tune
DT_par = { 'splitter': ['best', ], "min_samples_split": [2, 3, 10], "min_samples_leaf": [1, 3, 10]}
# Support Vector Machine Classifier
SV_clf = svm.SVC()
# Parameters to tune
SV_par = {'kernel': ['rbf'], 'C': [1]}
# AdaBoost Classifier
AD_clf = AdaBoostClassifier()
# Parameters to tune
AD_par = {'n_estimators':[10, 20, 50, 60], 'learning_rate':[0.1, 0.5, 1.0, 1.5], 'algorithm':['SAMME.R', 'SAMME']}
# Gradient Boosting Classifier
GB_clf = GradientBoostingClassifier()
# Parameters to tune
GB_par = {'loss':['deviance', 'exponential'], 'learning_rate':[0.01, 0.1, 0.5, 1.0], 'n_estimators':[50, 100, 150],
"min_samples_split": [2, 3], "min_samples_leaf": [1, 3], 'max_depth':[2, 3, 5]}
# SGD Classifier
SG_clf = SGDClassifier()
# Parameters to tune
SG_par = {'loss':['hinge', 'log', 'squared_hinge', 'perceptron'], 'penalty':['l2', 'l1'],
'alpha':[0.00001, 0.0001, 0.001], 'epsilon':[0.01, 0.1, 0.5]}
# Logistic Regression
LR_clf = LogisticRegression()
# Parameters to tune
LR_par= {'penalty':['l1','l2'], 'C': [0.5, 1, 5, 10], 'max_iter':[50, 100, 150, 200]}
# XGB Classifier
XB_clf = XGBClassifier()
# Parameters to tune
XB_par = {'max_depth':[2, 3, 5], 'learning_rate':[0.01, 0.1, 0.5, 1], 'n_estimators':[50, 100, 150, 200], 'gamma':[0, 0.001, 0.01, 0.1]}
# Parallel lists: classifiers[i], parameters[i] and classifiers_names[i]
# must stay aligned — tune_compare_clf zips them together.
classifiers = [RF_clf, XT_clf, DT_clf, SV_clf, AD_clf, GB_clf, SG_clf, LR_clf, XB_clf]
classifiers_names = ['Random Forest ', 'Extra DecisionTrees', 'Decision Tree ',
'Support Vector ', 'AdaBoost Classifier', 'Gradient Boosting ',
'SGD Classifier ', 'Logistic Regression', 'XGB Classifier ']
parameters = [RF_par, XT_par, DT_par, SV_par, AD_par, GB_par, SG_par, LR_par, XB_par]
def tune_compare_clf(X, y, classifiers, parameters, classifiers_names):
    '''Grid-search-tune each classifier and compare them on a held-out split.

    Parameters
    ----------
    X, y : features and labels.
    classifiers : list of scikit-learn estimators.
    parameters : list of parameter-grid dicts, aligned with `classifiers`.
    classifiers_names : list of display names, aligned with `classifiers`.

    The function splits the data 80/20 (fixed random_state=42), tunes each
    classifier with GridSearchCV on the training split, and prints each
    classifier's weighted F1 score on the test split.

    Returns
    -------
    result : pandas.DataFrame
        One row of test-set predictions per classifier.
    X_test, y_test : the held-out split.

    Python 3 fixes vs the original: print statements -> print() calls,
    itertools.izip (removed in Python 3) -> builtin zip.
    '''
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
    print("\n" "Train size : ", X_train.shape, " and Train labels : ", y_train.shape, "\n")
    print("Test size: ", X_test.shape, " and Test labels : ", y_test.shape, "\n", "\n")
    results = []
    print(" ---- F1 Score ---- ", "\n")
    for clf, par, name in zip(classifiers, parameters, classifiers_names):
        # Tune on the training split, evaluate on the held-out split.
        clf_tuned = GridSearchCV(clf, par).fit(X_train, y_train)
        y_pred = clf_tuned.predict(X_test)
        # Store the predictions for the later majority-vote ensemble.
        results.append(y_pred)
        print(name, ": %.2f%%" % (f1_score(y_test, y_pred, average='weighted') * 100.0))
    result = pd.DataFrame.from_records(results)
    return result, X_test, y_test
result, X_test, y_test = tune_compare_clf(X, y, classifiers, parameters, classifiers_names)
Store all classifiers' predictions: each column is a data point and each row is one classifier's predictions. We can use the describe() function to find the most common prediction for each data point — this way we're collecting the "votes" for each data point.
# Majority vote: for object-dtype columns, DataFrame.describe()'s third row
# ("top") is the most frequent value in each column, i.e. the modal
# prediction across classifiers for each test point.
y_pred_votes = result.describe().iloc[[2]]
y_pred_votes
# FIX: the printed label said "Accuracy" but the metric computed is the
# weighted F1 score.
print("F1 Score: %.2f%%" % (f1_score(y_test, y_pred_votes.T, average='weighted') * 100.0))
sns.set(rc={'figure.figsize':(5,5)})
cm = confusion_matrix(y_test,y_pred_votes.T)
sns.heatmap(cm,annot=True,fmt="d")
Earlier in this project we defined and tested different approaches to use with our original dataset, and created some functions:
scaler(X)
selector(X, y, k)
remove_outliers(X, y, f, distance)
resample(X, y, method)
Now we'll test which technique, or set of techniques, is most effective for this dataset at minimizing error when classifying breast cancer.
# Rebuild the raw features + engineered features as the baseline dataset.
y = data["diagnosis"]
X = data.drop(["id", "diagnosis", "Unnamed: 32"], axis=1)
mean_volume = []
pi = 3.1415
# Sphere volume from the mean radius for each row.
for i in range(len(X)):
    mean_volume.append((math.pow(X["radius_mean"][i], 3)*4*pi)/3)
X["mean_volume"]= mean_volume
X["mesuraments_sum_mean"] = X["radius_mean"] + X["perimeter_mean"] + X["area_mean"]
# Experiment: original (unscaled) data.
result, X_test, y_test = tune_compare_clf(X, y, classifiers, parameters, classifiers_names)
# Experiment: min-max scaled data.
X_scaled = scaler(X)
result, X_test, y_test = tune_compare_clf(X_scaled, y, classifiers, parameters, classifiers_names)
# Experiment: outliers removed (IQR distance=2, flagged in >2 features).
X_good, y_good = remove_outliers(X, y, f=2, distance=2)
result, X_test, y_test = tune_compare_clf(X_good, y_good, classifiers, parameters, classifiers_names)
# Experiment: 12 best features (chi2).
X_selected = selector(X, y, k=12)
result, X_test, y_test = tune_compare_clf(X_selected, y, classifiers, parameters, classifiers_names)
# Experiment: 10 best features (chi2).
X_selected = selector(X, y, 10)
result, X_test, y_test = tune_compare_clf(X_selected, y, classifiers, parameters, classifiers_names)
# Experiment: random over-sampling.
X_new, y_new = resample(X, y, method="RandomOverSampler")
result, X_test, y_test = tune_compare_clf(X_new, y_new, classifiers, parameters, classifiers_names)
# Experiment: SMOTE over-sampling.
X_new, y_new = resample(X, y, method="SMOTE")
result, X_test, y_test = tune_compare_clf(X_new, y_new, classifiers, parameters, classifiers_names)
# Experiment: ADASYN over-sampling.
X_new, y_new = resample(X, y, method="ADASYN")
result, X_test, y_test = tune_compare_clf(X_new, y_new, classifiers, parameters, classifiers_names)
# Experiment: scaled + outliers removed.
X_scaled = scaler(X)
X_good, y_good = remove_outliers(X_scaled, y, f=2, distance=2)
result, X_test, y_test = tune_compare_clf(X_good, y_good, classifiers, parameters, classifiers_names)
# Experiment: scaled + random over-sampling.
X_scaled = scaler(X)
X_new, y_new = resample(X_scaled, y, method="RandomOverSampler")
result, X_test, y_test = tune_compare_clf(X_new, y_new, classifiers, parameters, classifiers_names)
# Experiment: scaled + outliers removed + random over-sampling.
X_scaled = scaler(X)
X_good, y_good = remove_outliers(X_scaled, y, f=2, distance=2)
X_new, y_new = resample(X_good, y_good, method="RandomOverSampler")
result, X_test, y_test = tune_compare_clf(X_new, y_new, classifiers, parameters, classifiers_names)
# Combined pipeline: feature selection + scaling + outlier removal + resampling.
X_selected = selector(X, y, 10)
X_scaled = scaler(X_selected)
X_good, y_good = remove_outliers(X_scaled, y, f=2, distance=2)
X_new, y_new = resample(X_good, y_good, method="RandomOverSampler")
# BUG FIX: the original evaluated tune_compare_clf(X, y, ...) here — the raw
# data — so the prepared (selected/scaled/cleaned/resampled) data above was
# never actually tested (the result table's last column duplicates "Original").
result, X_test, y_test = tune_compare_clf(X_new, y_new, classifiers, parameters, classifiers_names)
| Classifier | Original | Scaled | Outiliers Removed | 12 Features | 10 Features | Resampled Randon | SMOTE | ADASYN |
|---|---|---|---|---|---|---|---|---|
| Random Forest | 95.60% | 96.47% | 94.23% | 94.68% | 95.58% | 96.50% | 96.50% | 96.52% |
| Extra DecisionTrees | 96.47% | 95.60% | 96.14% | 96.47% | 98.24% | 99.30% | 97.90% | 95.83% |
| Decision Tree | 94.74% | 94.74% | 93.22% | 95.58% | 94.74% | 97.20% | 93.70% | 94.44% |
| Support Vector | 47.80% | 96.45% | 50.05% | 47.80% | 47.80% | 37.32% | 37.32% | 44.34% |
| AdaBoost Classifier | 96.49% | 96.49% | 96.12% | 95.60% | 94.71% | 97.90% | 96.50% | 95.83% |
| Gradient Boosting | 94.74% | 94.74% | 94.21% | 92.95% | 95.60% | 96.50% | 96.50% | 96.52% |
| SGD Classifier | 87.42% | 97.36% | 90.16% | 76.39% | 24.34% | 73.29% | 61.44% | 74.61% |
| Logistic Regression | 96.47% | 97.36% | 98.06% | 95.58% | 97.35% | 98.60% | 98.60% | 97.92% |
| XGB Classifier | 96.47% | 96.47% | 96.16% | 96.49% | 94.74% | 95.80% | 95.80% | 95.83% |
| Classifier | Scaled + Outiliers Removed | Scaled + Resampled | Scaled + Outiliers Removed+Resampled | Scaled + Feature + Out. Rem + Resampled |
|---|---|---|---|---|
| Random Forest | 96.14% | 97.90% | 97.83% | 96.47% |
| Extra DecisionTrees | 94.21% | 96.50% | 97.10% | 96.47% |
| Decision Tree | 91.24% | 97.20% | 94.20% | 94.74% |
| Support Vector | 98.05% | 95.81% | 97.83% | 47.80% |
| AdaBoost Classifier | 96.14% | 95.11% | 97.10% | 96.49% |
| Gradient Boosting | 95.20% | 96.50% | 99.28% | 95.60% |
| SGD Classifier | 98.05% | 93.68% | 97.83% | 85.07% |
| Logistic Regression | 97.08% | 95.80% | 97.83% | 96.47% |
| XGB Classifier | 96.16% | 97.20% | 98.55% | 96.47% |
The highest and most consistent results across all classifiers were achieved using scaling, outlier removal and data balancing. The highest score was reached by the Gradient Boosting algorithm (F1 Score = 99.28) and the lowest by Decision Tree (F1 Score = 94.20), which, even being the lowest, is still a very decent result.
# Scale, Outliers Remove and Resample — the best-performing combination.
X_scaled = scaler(X)
X_good, y_good = remove_outliers(X_scaled, y, f=2, distance=2)
X_new, y_new = resample(X_good, y_good, method="RandomOverSampler")
result, X_test, y_test = tune_compare_clf(X_new, y_new, classifiers, parameters, classifiers_names)
# Majority vote across classifiers: describe()'s "top" row is the modal
# prediction per test point.
y_pred_votes = result.describe().iloc[[2]]
# FIX: the printed label said "Accuracy" but the metric computed is the
# weighted F1 score.
print("F1 Score: %.2f%%" % (f1_score(y_test, y_pred_votes.T, average='weighted') * 100.0))
sns.set(rc={'figure.figsize':(5,5)})
cm = confusion_matrix(y_test,y_pred_votes.T)
sns.heatmap(cm,annot=True,fmt="d")

Using F1 Score formula:

In statistical analysis of binary classification, the F1 score (also F-score or F-measure) is a measure of a test's accuracy. It considers both the precision p and the recall r of the test to compute the score: p is the number of correct positive results divided by the number of all positive results returned by the classifier, and r is the number of correct positive results divided by the number of all relevant samples (all samples that should have been identified as positive). The F1 score is the harmonic average of the precision and recall, where an F1 score reaches its best value at 1 (perfect precision and recall) and worst at 0.